Introduction

TODO

Binomial distribution (barplots, formula notation, math. expressions)

barplot formula notation

expression

expression w/ text (paste plus space, tilde for space)

p <- 0.2
x <- 0:10
n <- 10
y <- dbinom(x, n, p)

barplot(y ~ x,
        xlab = "x", ylab = expression(P(X == x)),
        main = expression(paste("Binomial distribution ", X %~% ~ B(10, 0.2))))

substitute

p <- 0.2
x <- 0:10
n <- 10
y <- dbinom(x, n, p)

barplot(y ~ x,
        xlab = "x", ylab = expression(P(X == x)),
        main = substitute(paste("Binomial distribution ", X %~% ~ B(n, p)), list(n=n, p=p)))

Log normal distribution as example of a skewed distribution (abline() for vertical lines, legend() for text labels)

x <- seq(0, 3, by = 0.001)
unit <- 100000
m <- 0.6
s <- 0.08
y <- dlnorm(x, log(m))

y <- y/sum(y)

mu <- sum(x * y * unit)
med <- max(x[cumsum(y) <= 0.5]) * unit

addlabel <- function(x, y, label, col, xjust = 0.05) {
    legend(x, y, label, text.col = col, box.col = "white", xjust = xjust, x.intersp = 0)
}

plot(x * unit, y, type = "l",
     xlab = "Income in €", ylab = "Density",
     main = "A fictitious but typical income distribution")
abline(v = c(med, mu), lty = "dashed", col = c("red", "blue"))
addlabel(med, 0.001, paste("Median:", med, "€"), "red")
addlabel(mu, 0.0008, paste("Mean:", round(mu), "€"), "blue")

From binomial to normal distribution (multiple plots, curve(), legend())

multiple graphics

better use “stick plot” (?)

p <- 0.5

xlim <- list(
    "10" = c(0, 10),
    "100" = c(35, 65),
    "1000" = c(450, 550)
)

lwd <- list(
    "10" = 5,
    "100" = 3,
    "1000" = 1
)


lastn <- 0
for (n in c(10, 100, 1000, 1000)) {
    x <- 0:n
    y <- dbinom(x, n, p)
    k <- as.character(n)

    plot(x, y, type = "h", lwd = lwd[[k]],
         xlab = 'x – Number of "heads"', ylab = expression(P(X == x)),
         main = sprintf("Binomial distribution for %d coin flips", n),
         xlim = xlim[[k]])

    if (lastn == 1000) {
        mu <- n*p
        sigma <- n*p/sqrt(n)
        f <- function(x) { dnorm(x, mean = mu, sd = sigma) }
        curve(f, xlim[[k]][1], xlim[[k]][2], add = TRUE, col = "red")
        legend(510, 0.025, substitute(paste("Normal distribution ", N(mu, sigma)),
                                      list(mu=mu, sigma=round(sigma, 2))),
               lty = "solid", col = "red", cex = 0.75)
    }

    lastn <- n
}

Continuous uniform distribution (line segments with segments(), area under the curve with polygon())

a <- 0
b <- 10
y <- 1/(b-a)

plot(c(a, b), c(y, y), xlim = c(a-1, b+1), ylim = c(0, y*1.1), pch = 16,
     xlab = "Waiting time in minutes", ylab = "Density",
     main = "Density function of the waiting time")
segments(a-1, 0, a, 0)
segments(a, 0, a, y, lty = "dashed")
segments(a, y, b, y)
segments(b, y, b, 0, lty = "dashed")
segments(b, 0, b+1, 0)

a <- 0
b <- 10
y <- 1/(b-a)

x1 <- 7.5
x2 <- 10

plot(c(a, b), c(y, y), xlim = c(a-1, b+1), ylim = c(0, y*1.1), pch = 16,
     xlab = "Waiting time in minutes", ylab = "Density",
     main = "Density function of the waiting time")

polygon(c(x1, x1, x2, x2, x1), c(0, y, y, 0, 0), col = "lightgray", border = FALSE)

text(x1 + (x2 - x1) / 2, y/2, substitute(P(a <= ~X <= b), list(a=x1, b=x2)), cex = 0.75)

segments(a-1, 0, a, 0)
segments(a, 0, a, y, lty = "dashed")
segments(a, y, b, y)
segments(b, y, b, 0, lty = "dashed")
segments(b, 0, b+1, 0)

Normal distribution

Normal distributions with different parameters (“empty” plots, styles, legends)

param <- list(
    c(0, 1),
    c(0, 0.5),
    c(0, 2),
    c(2, 1),
    c(-1, 0.4)
)

styles <- list(
    c("solid", "darkred"),
    c("solid", "coral"),
    c("solid", "red"),
    c("dashed", "darkred"),
    c("dotted", "orange")
)

for (maxshow in 1:length(param)) {
    plot(NULL, xlim = c(-5, 5), ylim = c(0, 1.0),
         main = "Normal distributions",
         xlab = "x",
         ylab = "f(x)")

    legend_labels <- character()
    legend_lty <- character()
    legend_col <- character()
    for (i in 1:maxshow) {
        p <- param[[i]]
        m <- p[1]
        s <- p[2]
        k <- sprintf("m%.1f_s%.1f", m, s)

        linest <- styles[[i]]

        f <- function(x) { dnorm(x, mean = m, sd = s) }
        curve(f, lwd = 3, lty = linest[1], col = linest[2], n = 1001, add = TRUE)

        legend_labels <- append(legend_labels, sprintf("N(%.1f, %.1f)", m , s))
        legend_lty <- append(legend_lty, linest[1])
        legend_col <- append(legend_col, linest[2])
    }

    legend("topright", legend_labels, lty = legend_lty, col = legend_col)
}

Standard normal distribution (curve())

curve(dnorm, lwd = 2, n = 1001, from = -3, to = 3,
      main = "Standard normal distribution N(0, 1)",
      xlab = "Standard unit z",
      ylab = "f(z)")

Example of a normal distribution (curve())

f <- function(x) { dnorm(x, mean = 10, sd = 8) }
curve(f, lwd = 2, n = 1001, from = -15, to = 35,
      main = "Temperature normally distrubted as N(10°C, 8°C)",
      xlab = "Temperature x in °C",
      ylab = "f(x)")

Standard normal distribution: areas under the curve (area under the curve with polygon(), annotations with arrows())

x <- seq(-3.5, 3.5, length = 1001)
y <- dnorm(x)
plot(x, y, ylim = c(0, 0.6), type = "l",
     xlab = "z", ylab = "f(z)", main = "Standard normal distribution N(0, 1)")

for (z in 1:3) {
    iv <- x >= -z & x <= z
    polygon(c(x[iv][1], x[iv], max(x[iv])),
            c(0, y[iv], 0),
            col = "#aaaaaa50", border = FALSE)
    abline(v = c(-z, z), lty = "dashed", col = paste0("#000000", c("FF", "AA", "66")[z]))
    y_segm <- 0.38 + 0.055 * z
    arrows(-z, y_segm, z, y_segm, code = 3, length = 0.1)
    text(0, y_segm, paste0(round((1-pnorm(-z) * 2) * 100, 1), "%"), adj = c(0.5, -0.2))
}

Normal distribution (math. expressions)

x <- seq(-3.5, 3.5, length = 1001)
y <- dnorm(x)
plot(x, y, ylim = c(0, 0.6), type = "l",
     xlab = "x", ylab = "f(x)",
     main = expression(paste("Normal distribution ", N(mu, sigma))),
     xaxt = "n")

for (z in 1:3) {
    iv <- x >= -z & x <= z
    polygon(c(x[iv][1], x[iv], max(x[iv])),
            c(0, y[iv], 0),
            col = "#aaaaaa50", border = FALSE)
    abline(v = c(-z, z), lty = "dashed", col = paste0("#000000", c("FF", "AA", "66")[z]))
    y_segm <- 0.38 + 0.055 * z
    arrows(-z, y_segm, z, y_segm, code = 3, length = 0.1)
    text(0, y_segm, paste0(round((1-pnorm(-z) * 2) * 100, 1), "%"), adj = c(0.5, -0.2))
}

xticks <- -3:3
axis(1, at = -3:3, labels = expression(mu - 3 * sigma,
                                       mu - 2 * sigma,
                                       mu - sigma,
                                       mu,
                                       mu + sigma,
                                       mu + 2 * sigma,
                                       mu + 3 * sigma))

Density function, distribution function, quantile function and random number generator (plotting random points under a curve)

x <- seq(-4, 4, length = 1000)
y <- dnorm(x)
plot(x, y, type = "l",
     main = "Density function f(x) of the standard normal distribution:\ndnorm(x)",
     xlab = "x", ylab = "f(x)")

x <- seq(-4, 4, length = 1000)
y <- pnorm(x)
plot(x, y, type = "l",
     main = "Distribution function F(x) of the standard normal distribution:\npnorm(x)",
     xlab = "x", ylab = "F(x)")

x <- seq(-4, 4, length = 1000)
y <- pnorm(x)
plot(x, y, type = "l",
     main = "Distribution function F(x) of the standard normal distribution:\npnorm(x)",
     xlab = "x", ylab = "F(x)")
p <- 0.9
q <- qnorm(0.9)
segments(-4, p, q, p, lty = "dashed", col = "red")
segments(q, p, q, 0, lty = "dashed", col = "red")
text(-3.5, p, paste("F(x) =", p), pos = 3, col = "red")
text(q, 0, paste("For which x?"), pos = 4, col = "red")

x <- seq(0, 1, length = 1000)
y <- qnorm(x)
plot(x, y, type = "l",
     main = expression(paste("Quantile function ", F^{-1} * (x), " of the standard normal distribution: qnorm(x)")),
     xlab = "x", ylab = expression(F^{-1} * (x)))

set.seed(20231206)
x <- rnorm(1000)
n <- length(x)
d <- density(x)

# for each generated x_i, find the corresponding value of the density curve; this allows to generate a random
# y-coordinate that fits under the density curve
max_y <- sapply(x, function(x_i) {
    d$y[min(which(x_i < d$x))]
})

plot(d, main = paste(n, "randomly sampled values from the standard normal distribution"),
     xlab = "x", ylab = "Density")
points(x, y = runif(n, 0, max_y), col = rgb(0, 0, 0, 0.5))

Sampling distributions and central limit theorem

Visualizing different sample draws and estimation errors (dotchart(), math. expressions, legends)

set.seed(20231024)

d <- read.table('data/miete03.asc', header = TRUE)

rent <- d$nm
mu <- mean(rent)

n <- length(rent)

hist(rent, main = "Histogram of the rent", freq = FALSE,
     sub = paste("n =", n), xlab = "Rent in €", ylab = "Probability")
abline(v = mu, lty = 'dashed', col = 'red')
text(mu, 0.0017, expression(mu), adj = c(-0.5, 0), col = 'red')

n <- 30
m <- 10

hat_mu_i <- sapply(1:m, function(i) {
    mean(sample(rent, n))
})

dotchart(hat_mu_i, labels = as.character(1:m), xlab = "Rent in €", ylab = "Sample",
         main = substitute(
             paste("Sample means ", hat(mu)[i], " for ", m, " samples of size n=", n),
             list(m=m, n=n)
        ))
abline(v = mu, lty = 'dashed', col = 'red')
text(mu, 0.5, expression(mu), adj = c(-0.5, 0), col = 'red')
mean_hat_mu <- mean(hat_mu_i)
abline(v = mean_hat_mu, lty = 'dashed', col = 'blue')
text(mean_hat_mu, 0.5, expression(bar(hat(mu))), adj = c(-0.5, 0), col = 'blue')

n <- 30
m <- 10000

hat_mu_i <- sapply(1:m, function(i) {
    mean(sample(rent, n))
})

hist(hat_mu_i, freq = FALSE, xlab = "Rent in €", ylab = "Probability",
     main = substitute(
         paste("Histogram of sample means for ", m, " samples of size n=", n),
         list(m=m, n=n)
     )
)

abline(v = mu, lty = "solid", col = 'red')
text(mu, 0, expression(mu), adj = c(-0.5, 0), col = 'red')
mean_hat_mu <- mean(hat_mu_i)
abline(v = mean_hat_mu, lty = 'dashed', col = 'blue')
text(mean_hat_mu, 0.0005, expression(bar(hat(mu))), adj = c(-0.5, 0), col = 'blue')

e <- round(abs(mean_hat_mu - mu), 2)
text(mean_hat_mu, 0.0085, expression(abs(paste(" ", bar(hat(mu)) - mu, " "))), adj = c(-1, 0))
text(mean_hat_mu, 0.0085, paste0("= ", e, "€"), adj = c(-2.2, -0.2))

m <- 10000
se_per_samplesize <- numeric()
samplesizes <- integer()

for (n in c(15, 30, 100, 1000)) {
    hat_mu_i <- sapply(1:m, function(i) {
        mean(sample(rent, n))
    })

    hist(hat_mu_i, freq = FALSE, xlab = "Rent in €", ylab = "Probability",
         main = substitute(
             paste("Histogram of sample means for ", m, " samples of size n=", n),
             list(m=m, n=n)
         ),
         breaks = seq(200, 940, by = 5)
    )

    se_per_samplesize <- c(se_per_samplesize, sd(hat_mu_i))
    samplesizes <- c(samplesizes, n)
}

plot(samplesizes, se_per_samplesize, type = "p",
     main = "Estimation error of the rent by sample size",
     xlab = "Sample size",
     ylab = "Standard error in €")
nrange <- min(samplesizes):max(samplesizes)
se_theoretic <- sd(rent) / sqrt(nrange)
lines(nrange, se_theoretic, lty = 'dashed', col = 'red')
legend("topright", c("Empirical standard error", "Theoretic standard error"),
       pch = c(1, NA_integer_), lty = c(NA_integer_, 2), col = c("black", "red"))

Distribution of the sample mean \(\overline X\) (math. expressions)

x <- seq(-3.5, 3.5, length = 1001)
y <- dnorm(x)
plot(x, y, ylim = c(0, 0.6), type = "l",
     xlab = "x", ylab = "f(x)",
     main = expression(paste("Distribution of the sample mean ", bar(X) %~% ~ N(mu, sigma[bar(X)]))),
     xaxt = "n")

for (z in 1:3) {
    iv <- x >= -z & x <= z
    polygon(c(x[iv][1], x[iv], max(x[iv])),
            c(0, y[iv], 0),
            col = "#aaaaaa50", border = FALSE)
    abline(v = c(-z, z), lty = "dashed", col = paste0("#000000", c("FF", "AA", "66")[z]))
    y_segm <- 0.38 + 0.055 * z
    arrows(-z, y_segm, z, y_segm, code = 3, length = 0.1)
    text(0, y_segm, paste0(round((1-pnorm(-z) * 2) * 100, 1), "%"), adj = c(0.5, -0.2))
}

xticks <- -3:3
axis(1, at = -3:3, labels = expression(mu - 3 * sigma[bar(X)],
                                       mu - 2 * sigma[bar(X)],
                                       mu - sigma[bar(X)],
                                       mu,
                                       mu + sigma[bar(X)],
                                       mu + 2 * sigma[bar(X)],
                                       mu + 3 * sigma[bar(X)]))

barplot(rep(1, 6) / 6, ylim = c(0, 1), names.arg = 1:6,
        main = "Probability distribution for a fair die",
        xlab = "Spots", ylab = "Probability")

set.seed(20231106)
m <- 10000

for (n in c(10, 30, 100)) {
    means <- sapply(1:m, function(i) {
        X <- sample(1:6, 10, replace = TRUE)
        mean(X)
    })
    
    hist(means, probability = TRUE, breaks = seq(1, 6, by = 0.1),
         main = paste("Distribution of the sample mean for the number of spots with n =", n),
         xlab = "Sample mean for the number of spots",
         ylab = "Density")
}

unit <- 100000
mean <- 0.6
x <- seq(0, 3, by = 0.01)
d <- dlnorm(x, log(mean)) * unit
d <- d[d < 100000]

hist(d, breaks = seq(0, 1.2, by = 0.1) * unit, probability = TRUE,
     main = "Fictitious income distribution",
     xlab = "Income in €",
     ylab = "Density")

m <- 10000

for (n in c(10, 100, 1000)) {
    means <- sapply(1:m, function(i) {
        X <- rlnorm(n, log(mean)) * unit
        mean(X)
    })
    means <- means[means < 250000]
    
    hist(means, probability = TRUE, breaks = seq(0, 2.5, by = 0.02) * unit,
         main = paste("Distribution of the sample mean of incomes with n =", n),
         xlab = "Sample mean of incomes",
         ylab = "Density")
}

Confidence intervals

set.seed(20231111)

d <- read.table('data/miete03.asc', header = TRUE)

rent <- d$nm
mu <- mean(rent)
sigma <- sqrt(mean((mu-rent)^2))

m <- 100   # number of draws
n <- 30    # sample size

sample_conf_interval <- function(i, data, n, sigma, z) {
    X <- sample(data, size = n)
    Xbar <- mean(X)
    E <- z * sigma/sqrt(n)
    c(Xbar - E, Xbar + E)
}


for (gamma in c(0.8, 0.95, 0.99)) {   # confidence levels
    alpha <- 1 - gamma    # significance level

    z <- qnorm(1-alpha/2)

    conf_ints <- t(sapply(1:m, sample_conf_interval, rent, n, sigma, z))

    plot(NULL, xlim = c(300, 850), ylim = c(1, m+1),
         main = sprintf("%d%%-confidence intervals for %d random samples (n=%d)\nof the rent dataset",
                        round(gamma*100), m, n),
         xlab = "Rent in €",
         ylab = "Sample draw")
    abline(v = mu, col = "red", lty = "dashed")
    text(mu, 100, expression(mu), col = "red", adj = c(-0.5, -0.5))

    mu_covered <- conf_ints[,1] < mu &  mu < conf_ints[,2]
    conf_int_col <- ifelse(mu_covered, "black", "red")
    segments(conf_ints[,1], 1:m, conf_ints[,2], 1:m, col = conf_int_col)
    legend("topright", c(expression(paste("Confidence interval covers ", mu)),
                         expression(paste("Confidence interval doesn't cover ", mu))),
           lty = "solid",
           col = c("black", "red"))
}

Student’s t distribution

curve(dnorm, lwd = 2, n = 1001, from = -5, to = 5,
      lty = "dashed",
      main = expression(paste(italic(t), " distributions and the standard normal distribution")),
      xlab = "x",
      ylab = "f(x)")

nvals <- c(2, 5, 15)
legend_labels <- c("N(0, 1)")
legend_col <- c(gray(0))
legend_lty <- c("dashed")

for (i in 1:length(nvals)) {
    n <- nvals[i]
    col <- gray(1-0.2*i)

    f <- function(x) { dt(x, n) }
    curve(f, lwd = 2, n = 1001, from = -5, to = 5, add = TRUE, col = col)
    legend_labels <- c(legend_labels, sprintf("t(%d)", n))
    legend_col <- c(legend_col, col)
    legend_lty <- c(legend_lty, "solid")
}

legend("topright", legend_labels, col = legend_col, lty = legend_lty)

Hypothesis testing

One-sample tests

mu <- 17
X <- c(16.4, 19.4, 17.8, 16.4, 18.5, 19.0, 19.5, 17.1, 16.1)
Xbar <- mean(X)
n <- length(X)
sigma <- 1.5 / sqrt(n)
z <- qnorm(c(0.025, 0.975), mean = Xbar, sd = sigma)
confint <- paste0(round(z, 2), "cm")
plot(X, runif(n), ylim = c(-0.5, 1.5), yaxt = "n",
     ylab = "", xlab = "Pencil length in cm",
     main = "Random sample of pencil lengths and 95% confidence interval")
abline(h = 0.5)
abline(v = mu, lty = "dashed")
text(mu, -0.45, expression(mu), adj = -0.5)
segments(z[1], -0.1, z[2], -0.1, col = "red")
segments(z[1], -0.05, z[1], -0.15, col = "red")
segments(z[2], -0.05, z[2], -0.15, col = "red")
segments(Xbar, -0.05, Xbar, -0.15, col = "red")
text(c(z, Xbar), 0.075, c(confint, paste0(Xbar, "cm")), col = "red")
text(Xbar, -0.45, expression(bar(X)), col = "red")

X <- c(16.4, 19.4, 17.8, 16.4, 18.5, 19.0, 19.5, 17.1, 16.1)
Xbar <- mean(X)
mu <- 17
n <- length(X)
sigma <- 1.5 / sqrt(n)
z <- qnorm(c(0.025, 0.975), mean = mu, sd = sigma)
p <- 1-pnorm(Xbar, mu, sigma)
x <- seq(15, 19, length = 1001)
y <- dnorm(x, mu, sigma)
plot(x, y, type = "l", xlab = "Pencil length in cm", ylab = "f(x)",
     main = expression(paste("Distribution of ", bar(X), " around ", mu, "=17cm under ", H[0],
                             " (", sigma, "=1.5cm and n=9)")))

iv <- x < z[1]
polygon(c(x[iv][1], x[iv], max(x[iv])),
        c(0, y[iv], 0),
        col = "#aaaaaa50", border = FALSE)
iv <- x > z[2]
polygon(c(x[iv][1], x[iv], max(x[iv])),
        c(0, y[iv], 0),
        col = "#aaaaaa50", border = FALSE)
abline(v = z, lty = "dashed", col = "#00000066")
abline(v = mu, lty = "dashed", col = "#000000DD")
text(mu, 0, expression(mu), adj = -0.25)

iv <- x >= Xbar
polygon(c(x[iv][1], x[iv], max(x[iv])),
        c(0, y[iv], 0),
        col = "#ff000050", border = FALSE)
abline(v = Xbar, lty = "dashed", col = "red")

text(Xbar, 0.15, substitute(P(bar(X) >= Xbar) == p,
                            list(Xbar=Xbar, p=sprintf("%.1f%%", p*100))),
     adj = -0.02, col = "red")
text(c(mu, z[2]), 0.035, c("95%", "2.5%"), adj = -0.25)
text(z[1], 0.035, "2.5%", adj = 1.25)

X <- c(16.4, 19.4, 17.8, 16.4, 18.5, 19.0, 19.5, 17.1, 16.1)
Xbar <- mean(X)
mu <- 17
n <- length(X)
sigma <- 1.5 / sqrt(n)
x <- seq(15, 19, length = 1001)
y <- dnorm(x, mu, sigma)
z <- qnorm(c(0.025, 0.975), mean = Xbar, sd = sigma)
konfint <- paste0(round(z, 2), "cm")

plot(x, y, type = "l", xlab = "Pencil length in cm", ylab = "f(x)",
     main = expression(paste("Distribution of ", bar(X), " around ", mu, "=17cm under ", H[0],
                             " (", sigma, "=1.5cm and n=9)")))
abline(v = mu, lty = "dashed")
text(mu, 0.1, expression(mu), adj = -0.5)
segments(z[1], -0.015, z[2], -0.015, col = "red")
segments(z[1], -0.01, z[1], -0.02, col = "red")
segments(z[2], -0.01, z[2], -0.02, col = "red")
segments(Xbar, -0.01, Xbar, -0.02, col = "red")
text(c(z, Xbar), 0.025, c(konfint, paste0(Xbar, "cm")), col = "red")
text(Xbar, 0.07, expression(bar(X)), col = "red")

x <- seq(-4, 4, length = 1001)
y <- dnorm(x)
plot(x, y, type = "l", xlab = "", ylab = "Density",
     main = expression(paste("One-sided test: Alternative hypothesis is ", mu < mu[0])),
     xaxt = "n")
p <- 0.3
z <- qnorm(p)
abline(v = 0, lty = "dashed")
segments(z, 0, z, dnorm(z), lty = "dashed")
axis(1, c(0, z), labels = c(expression(mu[0]), expression(bar(x))))

iv <- x < z
polygon(c(x[iv][1], x[iv], max(x[iv])),
        c(0, y[iv], 0),
        col = "#aaaaaa50", border = FALSE)
text(-1.5, 0.01, expression(P(bar(X) <= bar(x))))

plot(x, y, type = "l", xlab = "", ylab = "Density",
     main = expression(paste("One-sided test: Alternative hypothesis is ", mu > mu[0])),
     xaxt = "n")
abline(v = 0, lty = "dashed")
segments(z, 0, z, dnorm(z), lty = "dashed")
axis(1, c(0, z), labels = c(expression(mu[0]), expression(bar(x))))

iv <- x > z
polygon(c(x[iv][1], x[iv], max(x[iv])),
        c(0, y[iv], 0),
        col = "#aaaaaa50", border = FALSE)
text(1.3, 0.01, expression(P(bar(X) >= bar(x))))

x <- seq(-4, 4, length = 1001)
y <- dnorm(x)
plot(x, y, type = "l", xlab = "", ylab = "Density",
     main = expression(paste("Two-sided test: Sample mean is ", bar(x) < mu[0])),
     xaxt = "n")
p <- 0.3
z <- qnorm(p)
abline(v = 0, lty = "dashed")
segments(z, 0, z, dnorm(z), lty = "dashed")
axis(1, c(0, z), labels = c(expression(mu[0]), expression(bar(x))))

iv <- x < z
polygon(c(x[iv][1], x[iv], max(x[iv])),
        c(0, y[iv], 0),
        col = "#aaaaaa50", border = FALSE)
text(-1.5, 0.01, expression(P(bar(X) <= bar(x))))

x <- seq(-4, 4, length = 1001)
y <- dnorm(x)
plot(x, y, type = "l", xlab = "", ylab = "Density",
     main = expression(paste("Two-sided test: Sample mean is ", bar(x) > mu[0])),
     xaxt = "n")
abline(v = 0, lty = "dashed")
z <- qnorm(1-p)
segments(z, 0, z, dnorm(z), lty = "dashed")
axis(1, c(0, z), labels = c(expression(mu[0]), expression(bar(x))))

iv <- x > z
polygon(c(x[iv][1], x[iv], max(x[iv])),
        c(0, y[iv], 0),
        col = "#aaaaaa50", border = FALSE)
text(1.3, 0.01, expression(P(bar(X) >= bar(x))))

n <- 100
p <- 0.1
k <- 12
p_less_k <- round(pbinom(11, 100, 0.1), 3)
p_ge_k <- 1 - p_less_k
subst_params <- list(p_less_k=p_less_k, p_ge_k=p_ge_k)
x <- 0:100
y <- dbinom(x, n, p)

barcols <- ifelse(x >= k, "orange", "gray")
barplot(y ~ x, axis.lty = 1, col = barcols,
        xlab = "Number x of waste items",
        ylab = "Probability P(X=x)",
        main = "Probability of number of waste items under the null hypothesis")
legend("topright",
       c(expression(paste("Number of waste items " < 12)),
         expression(paste("Number of waste items " >= 12))),
       fill = c("gray", "orange"))
text(34, 0.04, substitute(P(X >= 12) == p, list(p = p_ge_k)), adj = 0)
arrows(33, 0.039, 20, 0.03, length = 0.1)

x <- 0:100
y <- pbinom(x, n, p)

barcols <- ifelse(x == k - 1, "red", "gray")
barplot(y, names.arg = as.character(x), axis.lty = 1,
        col = barcols,
        xlab = "Number x of waste items",
        ylab = expression(paste("Probability ", P(X <= x))),
        main = "Probability of number of waste items under the null hypothesis")

legend("topright",
       expression(paste("Probability ", P(X <= 11))),
       fill = "red")

Two-sample tests

pgreduced <- PlantGrowth[PlantGrowth$group != "trt1", ]
pgreduced$group <- factor(pgreduced$group)
boxplot(weight ~ group, data = pgreduced,
        xlab = "Group", ylab = "Dried weight of plants in kg",
        main = "Plant growth experiment")

ctrl <- pgreduced$group == "ctrl"
w_ctrl <- pgreduced$weight[ctrl]
w_trt2 <- pgreduced$weight[!ctrl]

xbar <- mean(w_trt2)
ybar <- mean(w_ctrl)
nx <- length(w_trt2)
ny <- length(w_ctrl)
se_x <- sd(w_trt2) / sqrt(nx)
se_y <- sd(w_ctrl) / sqrt(ny)

x <- seq(4, 6.5, length = 1001)
y_trt2 <- dnorm(x, mean = xbar, sd = se_x)
plot(w_trt2, runif(nx, 0, 0.1), col = "darkred", xlim = c(min(x), max(x)), ylim = c(0, 3),
     xlab = "Dried weight of plants in kg", ylab = "Density",
     main = "Plant growth experiment\nDistribution of sample means")
points(w_ctrl, runif(ny, 0, 0.1), col = "blue")
lines(x, y_trt2, col = "darkred")
abline(v = xbar, lty = "dashed", col = "darkred")
text(xbar, 0.15, substitute(bar(X) == xbar, list(xbar = paste0(xbar, "kg"))), pos = 4, col = "darkred")
y_ctrl <- dnorm(x, mean = ybar, sd = se_y)
lines(x, y_ctrl, col = "blue")
abline(v = ybar, lty = "dashed", col = "blue")
text(ybar, 0.15, substitute(bar(Y) == ybar, list(ybar = paste0(ybar, "kg"))), pos = 2, col = "blue")
legend("topright", c("X: treatment", "Y: control"),
       lty = "solid", col = c("darkred", "blue"))

w_ttest_res <- t.test(w_trt2, w_ctrl, alternative = "greater")
se <- w_ttest_res$stderr
d <- w_trt2 - w_ctrl
d_tscale <- d / se
dbar <- mean(d)
dbar_tscale <- dbar / se
xmin <- floor(min(d_tscale)) - 1
xmax <- ceiling(max(d_tscale))

x <- seq(xmin, xmax, length.out = 1001)
y <- dt(x, w_ttest_res$parameter)
plot(x, y, type = "l", xaxt = "n",
     xlab = expression(paste("Difference ", bar(X) - bar(Y), " in kg")),
     ylab = "Density",
     main = expression(paste("Distribution of the difference ", bar(X) - bar(Y),
                             " under ", H[0], ": ", mu[X] - mu[Y] <= 0)))
xaxt_labels <- seq(floor(xmin*se), ceiling(xmax*se), by = 0.5)
xaxt_labels_pos <- xaxt_labels/se
axis(1, at = xaxt_labels_pos, labels = xaxt_labels)
points(d_tscale, runif(length(d_tscale), 0, 0.01), col = "red")
abline(v = 0, lty = "dashed")
text(0, 0.025, expression(mu[X] - mu[Y] == 0), pos = 4)
abline(v = dbar_tscale, lty = "dashed", col = "red")
iv <- x >= dbar_tscale
polygon(c(x[iv][1], x[iv], max(x[iv])),
        c(0, y[iv], 0),
        col = "#ff000080", border = FALSE)
text(dbar_tscale + 0.25, 0.025, substitute(P(bar(X) - bar(Y) > dbar) == p,
                                           list(dbar = paste0(dbar, "kg"),
                                                p = round(w_ttest_res$p.value, 4))),
     pos = 4, col = "red")

x <- seq(xmin, xmax, length.out = 1001)
y <- dt(x, w_ttest_res$parameter)
plot(x, y, type = "l",
     xlab = "t",
     ylab = "Density",
     main = expression(paste(italic(t), " distribution under ", H[0], ": ", mu[X] - mu[Y] <= 0)))
points(d_tscale, runif(length(d), 0, 0.01), col = "red")
abline(v = 0, lty = "dashed")
text(0, 0.025, expression(mu[X] - mu[Y] == 0), pos = 4)
abline(v = dbar_tscale, lty = "dashed", col = "red")
iv <- x >= dbar_tscale
polygon(c(x[iv][1], x[iv], max(x[iv])),
        c(0, y[iv], 0),
        col = "#ff000080", border = FALSE)
text(dbar_tscale + 0.25, 0.025, substitute(1-P(t < tval) == pval,
                                           list(dbar = paste0(dbar, "kg"),
                                                tval = round(w_ttest_res$statistic, 4),
                                                pval = round(w_ttest_res$p.value, 4))),
     pos = 4, col = "red")

alpha <- 0.05
m <- 1:25
y <- 1-(1-alpha)^m

plot(m, y,
     main = substitute(paste("Error rate for multiple tests with ", alpha==a), list(a=alpha)),
     xlab = "Number of tests", ylab = "Probability of type I error")
lines(m, y)